1 Dataset for differential expression analysis

Expression analysis was performed on GEO dataset GSE15745 ‘Abundant Quantitative Trait Loci for CpG Methylation and Expression Across Human Brain Tissues’, which includes mRNA expression data for of 22,184 genes obtained from human brain tissues of 150 non-autistic individuals using Illumina humanRef-8 v2.0 expression beadchip. The study provides data from 4 different brain tissues: cerebellum (CRBLM), frontal cortex (FCTX), pons (PONS) and temporal cortex (TCTX). Samples originated from 3 distinct tissue banks: Baltimore Longitudinal Study of Aging (BLSA) from neuropathology department of John Hopkins hospital, non-BLSA samples from the same hospital (JHU) and NICHD brain and tissue bank for developmental disorders from University of Mariland medical school (UMARY). In addition, genome-wide SNP genotyping data of these subjects was obtained from dbGap, study accession phs000249.v1.p1, and included information for 561466 SNPs genotyped with Illumina HumanHap550v3.0 SNP array. Inversion genotypes were obtained with invClust

2 Differential expression analysis

First, let us load the ExpressionSet with the expression data and the table with the inversion genotypes from invClust

load('DE/brain.Rdata')
load('DE/inversions.RData')

Then, we perform normalization between arrays

exprsBrain <- exprs(brainExpr)
exprsBrain.norm <- log2(limma::normalizeBetweenArrays(exprsBrain))
brainExpr.norm <- brainExpr
exprs(brainExpr.norm) <- exprsBrain.norm
sampleNames(brainExpr.norm) <- colnames(exprsBrain)

pp <- pData(brainExpr.norm)
pp$id <- unlist(lapply(strsplit(as.character(pp$title), "-"), 
                       function(x) paste(x[1:2], collapse="-")))
pp2 <- left_join(pp, inversions, by=c("id"="SUBJID"))
rownames(pp2) <- pp2$geo_accession
pData(brainExpr.norm) <- pp2

And after that let us run differential expression analysis, one per region and inversion. Here we show the process for inv1_004

regions <- c("CRBLM", "FCTX", "PONS", "TCTX")
res <- list()
brainExpr.norm$inv <- additive(snp(pData(brainExpr.norm)[,'inv1_004']))
for (i in 1:4){
  ii <- grep(regions[i], pData(brainExpr.norm)$`source_name_ch1`)
  eSet <- brainExpr.norm[, ii]
  ans <- runPipeline(eSet, "inv", sva=TRUE)
  res[[i]] <- getProbeResults(ans, fNames = c("Symbol", "Chromosome", "Probe_Coordinates"))
}
names(res) <- regions

Finally, we extract the coordinates of each probe and keep those that overlap with the inversion

startstop <- function(x, pos){
  x <- as.character(x)
  if (x == ""){
    return(0)
  }
  else{
    coords <- c()
    splits <- strsplit(x, ':')[[1]]
    if (length(splits) == 1) {
      coords[1] <- strsplit(splits[1], '-')[[1]][1]
      coords[2] <- strsplit(splits[1], '-')[[1]][2]
    }
    else if (length(splits) > 1){
      coords[1] <- strsplit(splits[1], '-')[[1]][1]
      coords[2] <- strsplit(splits[length(splits)], '-')[[1]][2]
    }
    if (pos == 'start'){
      return(min(as.numeric(coords[1]), as.numeric(coords[2])))
    }
    else if (pos == 'stop'){
      return(max(as.numeric(coords[1]), as.numeric(coords[2])))
    }
  }
}

roi <- read.delim('DE/ROI.txt')
invinfo <- roi[roi$reg == 'inv1_004',]
invrange <- GRanges(seqnames = invinfo$chr, ranges = IRanges(start = invinfo$LBP-500000, end = invinfo$RBP+500000))
for (i in 1:4){
  res.region <- res[[i]]
  starts <- unlist(lapply(res.region$Probe_Coordinates, startstop, pos = 'start'))
  stops <- unlist(lapply(res.region$Probe_Coordinates, startstop, pos = 'stop'))
  ID <- res.region$Symbol
  chr <- res.region$Chromosome
  chr <- as.vector(chr)
  chr[chr == ""] <- 0
  chr <- as.factor(chr)
  rangesdf <- data.frame(ID, chr, starts, stops)
  granges <- makeGRangesFromDataFrame(rangesdf,
                                      keep.extra.columns=FALSE,
                                      ignore.strand=TRUE,
                                      seqinfo=NULL,
                                      seqnames.field="chr",
                                      start.field="starts",
                                      end.field="stops"
  )
  names(granges) <- rangesdf$ID
  genesinv <- names(subsetByOverlaps(granges, invrange))
  invres <- paste0(regions[i], '.', 'inv1_004')
  assign(invres, subset(res.region, Symbol %in% genesinv))
}

CRBLM.inv1_004$Region <- c('CRBLM')
FCTX.inv1_004$Region <- c('FCTX')
TCTX.inv1_004$Region <- c('TCTX')
PONS.inv1_004$Region <- c('PONS')
allres <- rbind(CRBLM.inv1_004, FCTX.inv1_004, TCTX.inv1_004, PONS.inv1_004)
allres <- allres[order(allres$P.Value),]

Significant genes in inv1_004 are

topres <- allres[allres$P.Value < 0.05, c(6, 10, 12, 13)]
topres
##                  P.Value  Symbol                   Probe_Coordinates Region
## ILMN_16577542 0.01440123 C1orf82                   92626087-92626136   TCTX
## ILMN_16564502 0.01467792    BRDT                   92190334-92190383   TCTX
## ILMN_17472852 0.01602019    HFM1                   91615678-91615727   TCTX
## ILMN_17418011 0.01968234    CDC7                   91763635-91763684   FCTX
## ILMN_16825773 0.03996710  FAM69A 93071130-93071164:93071165-93071179   PONS
## ILMN_16564503 0.04775801    BRDT                   92190334-92190383   PONS
## ILMN_17301182 0.04901165  ZNF644                   91153757-91153806   TCTX

For each of the significant genes we can make a boxplot (Figure 1 shows an example with C1orf82)

gene <- 'C1orf82'
id <- as.character(fData(brainExpr.norm)$ID[fData(brainExpr.norm)$Symbol == gene & fData(brainExpr.norm)$Probe_Coordinates == topres$Probe_Coordinates[topres$Symbol == gene]])
expr <- as.data.frame(exprs(brainExpr.norm)[id,], drop = F)
names(expr) <- 'Expression'
pdatasub <- pData(brainExpr.norm)[,c('source_name_ch1', 'inv1_004')]
names(pdatasub) <- c('Region', 'Inversion_Genotype')
plotdata <- merge(pdatasub, expr, by.x = 0, by.y = 0)
plotdata <- plotdata[!is.na(plotdata$Inversion_Genotype),]
plotdata$Region <- as.vector(plotdata$Region)
plotdata$Region[plotdata$Region == 'Human Brain Tissue: CRBLM'] <- 'CRBLM'
plotdata$Region[plotdata$Region == 'Human Brain Tissue: FCTX'] <- 'FCTX'
plotdata$Region[plotdata$Region == 'Human Brain Tissue: TCTX'] <- 'TCTX'
plotdata$Region[plotdata$Region == 'Human Brain Tissue: PONS'] <- 'PONS'
plotdata$Region <- as.factor(plotdata$Region)
pval <- data.frame(allres[c(allres$Symbol == gene & 
                            allres$Probe_Coordinates %in% topres$Probe_Coordinates), c(6, 10, 13)])
pval$x <- Inf
pval$y <- Inf
labs <- unlist(lapply(pval$`P.Value`, function(x) paste0('p-value = ', formatC(x, digits = 2, format = 'e'))))
pval$lab <- labs
ggplot(plotdata, aes(x=Inversion_Genotype, y=Expression, group = Inversion_Genotype)) + 
  theme(legend.position = 'none',
        plot.title = element_text(size = 20, family = 'Helvetica'),
        axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 10),
        axis.title.x = element_text(size = 12, margin = margin(t = 10, r = 0, b = 0, l = 0)),
        axis.title.y = element_text(size = 12, margin = margin(t = 0, r = 10, b = 0, l = 0))) +
  ggtitle(gene) +
  ylab('Normalized expression') +
  xlab('Inversion genotype') +
  geom_boxplot(aes(fill = Inversion_Genotype)) +
  scale_fill_manual(values = c('darkseagreen1', 'darkseagreen3', 'darkseagreen4')) +
  facet_grid(. ~ Region) +
  theme(strip.text.x = element_text(size = 15, face = 'bold', family = 'Helvetica'),
        plot.margin = margin(t = 15, r = 10, b = 5, l = 10)) +
  geom_label(data = data.frame(x = -Inf, y = Inf, lab = pval$lab, Region = pval$Region), aes(x, y, label = lab, group=NULL), hjust = 0, vjust = 1)
C1or82 brain expression between inversion inv1_004 genotypes

Figure 1: C1or82 brain expression between inversion inv1_004 genotypes

2.1 Boxplots of differential expression

Plots for all significative genes in all genotyped inversions (Figures 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 and 16)

2.2 inv1_004

Top genes for inv1_004

Figure 2: Top genes for inv1_004

2.3 inv2_013

Top genes for inv2_013

Figure 3: Top genes for inv2_013

2.4 inv3_003

Top genes for inv3_003

Figure 4: Top genes for inv3_003

2.5 inv6_002

Top genes for inv6_002

Figure 5: Top genes for inv6_002

2.6 inv6_006

Top genes for inv6_006

Figure 6: Top genes for inv6_006

2.7 inv7_005

Top genes for inv7_005

Figure 7: Top genes for inv7_005

2.8 inv7_011

Top genes for inv7_011

Figure 8: Top genes for inv7_011

2.9 inv8_001

Top genes for inv8_001

Figure 9: Top genes for inv8_001

2.10 inv10_005

Top genes for inv10_005

Figure 10: Top genes for inv10_005

2.11 inv11_004

Top genes for inv11_004

Figure 11: Top genes for inv11_004

2.12 inv12_004

Top genes for inv12_004

Figure 12: Top genes for inv12_004

2.13 inv12_006

Top genes for inv12_006

Figure 13: Top genes for inv12_006

2.14 inv14_005

Top genes for inv14_005

Figure 14: Top genes for inv14_005

2.15 inv16_009

Top genes for inv16_009

Figure 15: Top genes for inv16_009

2.16 inv17_007

Top genes for inv17_007

Figure 16: Top genes for inv17_007

3 Session info

sessionInfo()
## R version 4.0.0 (2020-04-24)
## Platform: x86_64-conda_cos6-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /home/isglobal.lan/lbalague/miniconda2/envs/brain/lib/libopenblasp-r0.3.9.so
## 
## locale:
##  [1] LC_CTYPE=es_ES.UTF-8       LC_NUMERIC=C               LC_TIME=es_ES.UTF-8        LC_COLLATE=es_ES.UTF-8     LC_MONETARY=es_ES.UTF-8    LC_MESSAGES=es_ES.UTF-8    LC_PAPER=es_ES.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] png_0.1-7            GenomicRanges_1.40.0 GenomeInfoDb_1.24.0  IRanges_2.22.2       S4Vectors_0.26.1     gridExtra_2.3        limma_3.44.1         SNPassoc_1.9-2       mvtnorm_1.1-1        survival_3.1-12      haplo.stats_1.7.9    MEAL_1.18.0          MultiDataSet_1.16.0  Biobase_2.48.0       BiocGenerics_0.34.0  invClust_1.0         forcats_0.5.0        stringr_1.4.0        dplyr_1.0.0          purrr_0.3.4          readr_1.3.1          tidyr_1.1.0          tibble_3.0.1         ggplot2_3.3.1        tidyverse_1.3.0      BiocStyle_2.16.0    
## 
## loaded via a namespace (and not attached):
##   [1] rms_6.0-0                   tidyselect_1.1.0            RSQLite_2.2.0               AnnotationDbi_1.50.0        htmlwidgets_1.5.1           grid_4.0.0                  BiocParallel_1.22.0         munsell_0.5.0               codetools_0.2-16            preprocessCore_1.50.0       withr_2.2.0                 fastICA_1.2-2               colorspace_1.4-1            highr_0.8                   knitr_1.28                  rstudioapi_0.11             labeling_0.3                JADE_2.0-3                  isva_1.9                    GenomeInfoDbData_1.2.3      farver_2.0.3                bit64_0.9-7                 rhdf5_2.32.0                TH.data_1.0-10              vctrs_0.3.1                 generics_0.0.2              xfun_0.14                   qqman_0.1.4                 BiocFileCache_1.12.0        R6_2.4.1                    clue_0.3-57                 illuminaio_0.30.0           locfit_1.5-9.4              bitops_1.0-6                reshape_0.8.8              
##  [36] DelayedArray_0.14.0         assertthat_0.2.1            scales_1.1.1                multcomp_1.4-13             nnet_7.3-14                 gtable_0.3.0                sva_3.36.0                  sandwich_2.5-1              MatrixModels_0.4-1          rlang_0.4.6                 genefilter_1.70.0           calibrate_1.7.5             splines_4.0.0               rtracklayer_1.48.0          acepack_1.4.1               GEOquery_2.56.0             broom_0.5.6                 checkmate_2.0.0             reshape2_1.4.4              BiocManager_1.30.10         yaml_2.2.1                  modelr_0.1.8                snpStats_1.38.0             GenomicFeatures_1.40.0      backports_1.1.7             qvalue_2.20.0               Hmisc_4.4-0                 tools_4.0.0                 bookdown_0.19               nor1mix_1.3-0               ellipsis_0.3.1              RColorBrewer_1.1-2          siggenes_1.62.0             Rcpp_1.0.4.6                plyr_1.8.6                 
##  [71] base64enc_0.1-3             progress_1.2.2              zlibbioc_1.34.0             RCurl_1.98-1.2              prettyunits_1.1.1           rpart_4.1-15                openssl_1.4.1               bumphunter_1.30.0           zoo_1.8-8                   SummarizedExperiment_1.18.1 haven_2.3.1                 cluster_2.1.0               fs_1.4.1                    magrittr_1.5                magick_2.3                  RSpectra_0.16-0             data.table_1.12.8           SparseM_1.78                reprex_0.3.0                matrixStats_0.56.0          hms_0.5.3                   evaluate_0.14               xtable_1.8-4                XML_3.99-0.3                jpeg_0.1-8.1                mclust_5.4.6                readxl_1.3.1                compiler_4.0.0              biomaRt_2.44.0              minfi_1.34.0                crayon_1.3.4                htmltools_0.4.0             mgcv_1.8-31                 Formula_1.2-3               lubridate_1.7.8            
## [106] DBI_1.1.0                   dbplyr_1.4.4                MASS_7.3-51.6               rappdirs_0.3.1              Matrix_1.2-18               permute_0.9-5               cli_2.0.2                   quadprog_1.5-8              pkgconfig_2.0.3             GenomicAlignments_1.24.0    foreign_0.8-80              SmartSVA_0.1.3              xml2_1.3.2                  foreach_1.5.0               annotate_1.66.0             rngtools_1.5                multtest_2.44.0             beanplot_1.2                XVector_0.28.0              rvest_0.3.5                 doRNG_1.8.2                 scrime_1.3.5                digest_0.6.25               vegan_2.5-6                 Biostrings_2.56.0           rmarkdown_2.3               base64_2.0                  cellranger_1.1.0            htmlTable_1.13.3            edgeR_3.30.3                DelayedMatrixStats_1.10.0   curl_4.3                    quantreg_5.55               Rsamtools_2.4.0             lifecycle_0.2.0            
## [141] nlme_3.1-148                jsonlite_1.6.1              Rhdf5lib_1.10.0             askpass_1.1                 fansi_0.4.1                 pillar_1.4.4                lattice_0.20-41             httr_1.4.1                  glue_1.4.1                  iterators_1.0.12            bit_1.1-15.2                class_7.3-17                stringi_1.4.6               HDF5Array_1.16.0            blob_1.2.1                  polspline_1.1.19            latticeExtra_0.6-29         memoise_1.1.0               e1071_1.7-3